The summary statistics and respective plots along with eSet creation is in 00_summstats_eset.{Rmd, html}, imputation with random forest is in 01_pml_imp_summary.{Rmd, html}, DimRed analysis is in 02_dimred_hvg.{Rmd, html}. This file contains differential expression analysis using DESeq2 along with significant marker info. The analysis is performed with age, sex and smoking status as covariates with histopathological group. This file contains GSVA analysis from signatures coming from relevant datasets along with hypeR GSEA from the signatures collected from differential anal results.
library(ggplot2)
library(stringr)
library(DT)
library(plyr)
library(dplyr)
library(biomaRt)
library(Biobase)
library(reshape2)
library(formattable)
library(VennDiagram)
library(hypeR)
library(xlsx)
library(DESeq2)
library(gridExtra)
library(ggsci)
PATH <- getwd()
eset <- readRDS(file.path(PATH, "data/2021_08_20_eset_imputed.RDS"))
eSet_wo_infl <- eset[, which(pData(eset)$Class != 'Inflammatory')]
table(eSet_wo_infl$Class)
##
## Cancer Control Dysplasia HkNR
## 9 18 22 17
eSet_wo_infl$Class <- recode(eSet_wo_infl$Class, "Cancer"="OSCC")
eSet_wo_infl$Class <- factor(eSet_wo_infl$Class, levels = c("Control", "HkNR", "Dysplasia", "OSCC"))
cpm_eset <- eSet_wo_infl
exprs(cpm_eset) <- apply(exprs(cpm_eset), 2, function(x) {x/(sum(x)/1000000)})
print(dim(cpm_eset))
## Features Samples
## 21510 66
cpm_eset$Class <- recode(cpm_eset$Class, "Control"="1-Control", "HkNR"="2-HkNR", "Dysplasia"="3-Dysplasia", "OSCC"="4-OSCC")
cpm_eset$Class <- factor(cpm_eset$Class, levels = c("1-Control", "2-HkNR", "3-Dysplasia", "4-OSCC"))
list_signs <- readRDS(file.path(PATH, "results/06_22_pml_signatures_w_sex_smoke_logFC1.5_fdr0.05.RDS"))
opmd <- list(up=c("SPRR2B", "DLX2", "SPRR2C", "CERS1", "CKB", "CYP19A1", "CARMIL3", "H2AC14",
"TUBA1B"),
down=c("IER3", "NGEF", "TUBA4A", "ACP6", "SPIDR", "CD46", "GNPTAB", "LCA5", "ZMAT1",
"SLC9A9", "ZNF204P", "PTCHD1", "FAM46A", "LGR5", "MUC1", "COLCA2", "DM1-AS", "ZNF418", 'NECTIN3', "MLPH",
"CCDC129", "TFCP2L1", "ATP6V1B1", "CRACR2B", "ERN2", "UGT1A6", "TLX1", "MUC16"))
gsva_res_opmd <- as.data.frame(t(GSVA::gsva(exprs(cpm_eset), opmd, verbose=FALSE)))
gsva_res_opmd$diff <- gsva_res_opmd$up-gsva_res_opmd$down
gsva_res_opmd$Class <- cpm_eset$Class[match(rownames(gsva_res_opmd), colnames(cpm_eset))]
opmd <- ggplot2::ggplot(gsva_res_opmd, ggplot2::aes_string(
x = 'Class', y = 'diff', fill='Class')) +
ggplot2::geom_boxplot() +
scale_color_npg(alpha = 0.8)+
scale_fill_npg(alpha = 0.7)+
ggplot2::geom_jitter(size=0.3) +
ggpubr::stat_compare_means(method = "anova", label = 'p.format')+
labs(y = "GSVA Score", x = "Type")+
theme_bw()+
ggplot2::theme(strip.text.x = ggplot2::element_text(size = 8),
axis.text.x = element_text(angle = 45, hjust = 1, margin=ggplot2::margin(5, b=10)),
axis.text = element_text(size = 10, family='Helvetica', color="#222222"),
axis.title = element_text(size = 12, family='Helvetica', color="#222222"),
legend.text = ggplot2::element_text(family='Helvetica', size=(10), color="#222222"))
opmd
tcga <- readRDS("~/Documents/Research/pml/tcga_sigs.rds")
tcga_res <- GSVA::gsva(expr = exprs(cpm_eset), gset.idx.list = tcga)
## Estimating GSVA scores for 2 gene sets.
## Estimating ECDFs with Gaussian kernels
##
|
| | 0%
|
|=================================== | 50%
|
|======================================================================| 100%
tcga_res_df <- as.data.frame(t(tcga_res))
tcga_res_df$diff <- tcga_res_df$up - tcga_res_df$down
#gsvaViolinplot(gsvaData = t(tcga_res_df), textsize = 10, eset = cpm_eset, title = 'TCGA')
tcga_res_df$Class <- cpm_eset$Class[match(rownames(tcga_res_df), colnames(cpm_eset))]
up <- ggplot2::ggplot(tcga_res_df, ggplot2::aes_string(
x = 'Class', y = 'up', fill='Class')) +
ggplot2::geom_boxplot() +
ggplot2::geom_jitter(size=0.3) +
scale_color_npg(alpha = 0.8)+
scale_fill_npg(alpha = 0.7)+
ggpubr::stat_compare_means(method = "anova", label = 'p.format')+
labs(y = "GSVA Score", x = "Type")+
theme_bw()+
ggplot2::theme(strip.text.x = ggplot2::element_text(size = 8),
axis.text.x = element_text(angle = 45, hjust = 1, margin=ggplot2::margin(5, b=10)),
axis.text = element_text(size = 10, family='Helvetica', color="#222222"),
axis.title = element_text(size = 12, family='Helvetica', color="#222222"),
legend.text = ggplot2::element_text(family='Helvetica', size=(10), color="#222222"))
up
down <- ggplot2::ggplot(tcga_res_df, ggplot2::aes_string(
x = 'Class', y = 'down', fill='Class')) +
ggplot2::geom_boxplot() +
ggplot2::geom_jitter(size=0.3) +
scale_color_npg(alpha = 0.8)+
scale_fill_npg(alpha = 0.7)+
ggpubr::stat_compare_means(method = "anova", label = 'p.format')+
labs(y = "GSVA Score", x = "Type")+
theme_bw()+
ggplot2::theme(strip.text.x = ggplot2::element_text(size = 8),
axis.text.x = element_text(angle = 45, hjust = 1, margin=ggplot2::margin(5, b=10)),
axis.text = element_text(size = 10, family='Helvetica', color="#222222"),
axis.title = element_text(size = 12, family='Helvetica', color="#222222"),
legend.text = ggplot2::element_text(family='Helvetica', size=(10), color="#222222"))
down
tcga <- ggplot2::ggplot(tcga_res_df, ggplot2::aes_string(
x = 'Class', y = 'diff', fill='Class')) +
ggplot2::geom_boxplot() +
ggplot2::geom_jitter(size=0.3) +
scale_color_npg(alpha = 0.8)+
scale_fill_npg(alpha = 0.7)+
ggpubr::stat_compare_means(method = "anova", label = 'p.format')+
labs(y = "GSVA Score", x = "Type")+
theme_bw()+
ggplot2::theme(strip.text.x = ggplot2::element_text(size = 8),
axis.text.x = element_text(angle = 45, hjust = 1, margin=ggplot2::margin(5, b=10)),
axis.text = element_text(size = 10, family='Helvetica', color="#222222"),
axis.title = element_text(size = 12, family='Helvetica', color="#222222"),
legend.text = ggplot2::element_text(family='Helvetica', size=(10), color="#222222"))
tcga
hnsc_sign <- readRDS(file.path("~/Documents/Research/HNSC_CuratedGenesets.rds"))
hnsc_sign_epi <- hnsc_sign[names(hnsc_sign) %in% c("Cell_Cycle_PMID29198524", "pEMT_PMID29198524", "Epithelial_Differentiation_1_PMID29198524", "Epithelial_Differentiation_2_PMID29198524", "Stress_PMID29198524", "Hypoxia_PMID29198524")]
names(hnsc_sign_epi)<- sapply(names(hnsc_sign_epi), function(x) str_split(x, pattern = "_PMID")[[1]], USE.NAMES=FALSE)[1,]
names(hnsc_sign_epi) <- recode(names(hnsc_sign_epi), "Epithelial_Differentiation_1"="Epi. Diff. 1", "Epithelial_Differentiation_2"="Epi. Diff. 2")
gsva_res_epi <- as.data.frame(t(GSVA::gsva(exprs(cpm_eset), hnsc_sign_epi, verbose=FALSE)))
gsva_res_epi$Class <- cpm_eset$Class[match(rownames(gsva_res_epi), colnames(cpm_eset))]
pemt <- ggplot2::ggplot(gsva_res_epi, ggplot2::aes_string(
x = 'Class', y = 'pEMT', fill='Class')) +
ggplot2::geom_boxplot() +
scale_color_npg(alpha = 0.8)+
scale_fill_npg(alpha = 0.7)+
ggplot2::geom_jitter(size=0.3) +
labs(y = "GSVA Score", x = "Type")+
ggpubr::stat_compare_means(method = "anova", label = 'p.format')+
theme_bw()+
ggplot2::theme(strip.text.x = ggplot2::element_text(size = 8),
axis.text.x = element_text(angle = 45, hjust = 1, margin=ggplot2::margin(5, b=10)),
axis.text = element_text(size = 10, family='Helvetica', color="#222222"),
axis.title = element_text(size = 12, family='Helvetica', color="#222222"),
legend.text = ggplot2::element_text(family='Helvetica', size=(10), color="#222222"))
pemt
J. Beane Bronchial PML
pml_genes <- readRDS("~/Downloads/JBeane_PML/combined_gene_set_symbols.rds")
#names(pml_genes) <- paste("module", seq_len(length(names(pml_genes))), sep = "")
names(pml_genes) <- paste(c("module6", "module1", "module5", "module8", "module3", "module2", "module9", "module7", "module4"), sep = "")
#selecting modules that show a positive trend
pml_genes_prolif <- pml_genes[c('module5','module9', 'module8')]
prolif_gsva <- GSVA::gsva(exprs(cpm_eset), pml_genes_prolif)
## Estimating GSVA scores for 3 gene sets.
## Estimating ECDFs with Gaussian kernels
##
|
| | 0%
|
|======================= | 33%
|
|=============================================== | 67%
|
|======================================================================| 100%
prolif_res_df <- as.data.frame(t(prolif_gsva))
prolif_res_df$Class <- cpm_eset$Class
prolif1 <- ggplot2::ggplot(prolif_res_df, ggplot2::aes_string(
x = 'Class', y = 'module5', fill='Class')) +
ggplot2::geom_boxplot() +
scale_color_npg(alpha = 0.8)+
scale_fill_npg(alpha = 0.7)+
ggplot2::geom_jitter(size=0.3) +
labs(y = "GSVA Score", x = "Type")+
ggpubr::stat_compare_means(method = "anova", label = 'p.format')+
theme_bw()+
ggplot2::theme(strip.text.x = ggplot2::element_text(size = 8),
axis.text.x = element_text(angle = 45, hjust = 1, margin=ggplot2::margin(5, b=10)),
axis.text = element_text(size = 10, family='Helvetica', color="#222222"),
axis.title = element_text(size = 12, family='Helvetica', color="#222222"),
legend.text = ggplot2::element_text(family='Helvetica', size=(10), color="#222222"),legend.position = "none")
prolif1
prolif2 <- ggplot2::ggplot(prolif_res_df, ggplot2::aes_string(
x = 'Class', y = 'module9', fill='Class')) +
ggplot2::geom_boxplot() +
scale_color_npg(alpha = 0.8)+
scale_fill_npg(alpha = 0.7)+
ggplot2::geom_jitter(size=0.3) +
labs(y = "GSVA Score", x = "Type")+
ggpubr::stat_compare_means(method = "anova", label = 'p.format')+
theme_bw()+
ggplot2::theme(strip.text.x = ggplot2::element_text(size = 8),
axis.text.x = element_text(angle = 45, hjust = 1, margin=ggplot2::margin(5, b=10)),
axis.text = element_text(size = 10, family='Helvetica', color="#222222"),
axis.title = element_text(size = 12, family='Helvetica', color="#222222"),
legend.text = ggplot2::element_text(family='Helvetica', size=(10), color="#222222"), legend.position = "none")
prolif2
prolif3 <- ggplot2::ggplot(prolif_res_df, ggplot2::aes_string(
x = 'Class', y = 'module8', fill='Class')) +
ggplot2::geom_boxplot() +
scale_color_npg(alpha = 0.8)+
scale_fill_npg(alpha = 0.7)+
ggplot2::geom_jitter(size=0.3) +
labs(y = "GSVA Score", x = "Type")+
ggpubr::stat_compare_means(method = "anova", label = 'p.format')+
theme_bw()+
ggplot2::theme(strip.text.x = ggplot2::element_text(size = 8),
axis.text.x = element_text(angle = 45, hjust = 1, margin=ggplot2::margin(5, b=10)),
axis.text = element_text(size = 10, family='Helvetica', color="#222222"),
axis.title = element_text(size = 12, family='Helvetica', color="#222222"),
legend.text = ggplot2::element_text(family='Helvetica', size=(10), color="#222222"),legend.position = "none")
prolif3
prog <- data.frame(t(exprs(cpm_eset)[c("PDGFRB", "COL1A1", "COL1A2", "COL3A1"),]))
gsvaResT <- prog
condition <- 'Class'
cond <- apply(pData(cpm_eset)[, "Class",drop = FALSE],1, paste, collapse = "_")
gsvaResT[, paste(condition, collapse = "_")] <- cond
#gsvaResT$Class <- dplyr::recode_factor(gsvaResT$Class, 'Control'='1-Control', 'HkNR'='2-HkNR', 'Dysplasia'='3-Dysplasia', 'Cancer'='4-Cancer')
gsvaResFlat <- reshape2::melt(gsvaResT, id.vars = paste(condition, collapse = "_"), variable.name = "pathway")
g1 <- ggplot2::ggplot(gsvaResFlat, ggplot2::aes_string(
x = paste(condition, collapse = "_"), y = "value",
color = paste(condition, collapse = "_"))) +
ggplot2::geom_boxplot() +
scale_color_npg(alpha = 0.8)+
scale_fill_npg(alpha = 0.7)+
ggplot2::geom_jitter(size=0.3) +
ggplot2::facet_wrap(~pathway, scale = "free_y",
ncol = nrow(prog)) +
#ggpubr::stat_compare_means(method = "anova", label = 'p.format')+
ggplot2::theme(strip.text.x = ggplot2::element_text(size = 10))+
ggplot2::theme(axis.text.x = element_text(angle = 45, hjust = 1))+
labs(title = 'Fibroblast markers', y = "Counts(CPM)", x = "Class")+
theme(axis.text = element_text(size = 10), axis.title = element_text(size = 10, face = "bold"))
g1
pdgfb <- xlsx::read.xlsx(file = file.path(PATH, "data/pdgfb_kartha_plosone_2016.xlsx"), sheetIndex = 1)
pdgfb_genes <- pdgfb %>% dplyr::filter(Pearson.r >= 0.80) %>% dplyr::select(Gene_ID)
pdgfb_sig <- list("pdgfb"=vapply(strsplit(pdgfb_genes$Gene_ID, "|", fixed = TRUE), "[", "", 1))
pdgfb_gsva_res1 <- GSVA::gsva(expr = exprs(cpm_eset), gset.idx.list = pdgfb_sig)
## Estimating GSVA scores for 1 gene sets.
## Estimating ECDFs with Gaussian kernels
##
|
| | 0%
|
|======================================================================| 100%
gsvaViolinplot <- function(gsvaData, textsize, eset, title) {
gsvaResT <- data.frame(t(gsvaData))
condition <- 'Class'
cond <- apply(pData(cpm_eset)[, "Class",drop = FALSE],1, paste, collapse = "_")
gsvaResT[, paste(condition, collapse = "_")] <- cond
gsvaResFlat <- reshape2::melt(gsvaResT, id.vars = paste(condition,
collapse = "_"),
variable.name = "pathway")
ggplot2::ggplot(gsvaResFlat, ggplot2::aes_string(
x = paste(condition, collapse = "_"), y = "value",
color = paste(condition, collapse = "_"))) +
ggplot2::geom_boxplot() +
scale_color_npg(alpha = 0.8)+
scale_fill_npg(alpha = 0.7)+
ggplot2::geom_jitter(size=0.3) +
ggplot2::facet_wrap(~pathway, scale = "free_y",
ncol = ceiling(sqrt(nrow(gsvaData)))) +
ggpubr::stat_compare_means(method = "anova", label = 'p.format')+
ggplot2::theme(strip.text.x = ggplot2::element_text(size = textsize))+
ggplot2::theme(axis.text.x = element_text(angle = 45, hjust = 1))+
labs(title = title, y = "GSVA Score", x = "Class")+
theme(axis.text = element_text(size = 10), axis.title = element_text(size = 10, face = "bold"))
}
gsvaViolinplot(gsvaData = pdgfb_gsva_res1, textsize = 8,eset = cpm_eset, title = "PDGFRB sigs")
hnsc_sign <- readRDS(file.path(PATH, "data/HNSC_CuratedGenesets.rds"))
hnsc_sign_epi <- hnsc_sign[names(hnsc_sign) %in% c("Cell_Cycle_PMID29198524", "pEMT_PMID29198524", "Epithelial_Differentiation_1_PMID29198524", "Epithelial_Differentiation_2_PMID29198524", "Stress_PMID29198524", "Hypoxia_PMID29198524")]
names(hnsc_sign_epi)<- sapply(names(hnsc_sign_epi), function(x) str_split(x, pattern = "_PMID")[[1]], USE.NAMES=FALSE)[1,]
hnsc_pemt <- hnsc_sign_epi['pEMT']
gsva_pemt <- GSVA::gsva(expr = exprs(eSet_wo_infl), gset.idx.list = hnsc_pemt)
## Estimating GSVA scores for 1 gene sets.
## Estimating ECDFs with Gaussian kernels
##
|
| | 0%
|
|======================================================================| 100%
gsvaViolinplot(gsvaData = gsva_pemt, textsize = 8, eset = cpm_eset, title = "pEMT sigs")
df_plasticity <- data.frame(t(gsva_pemt), t(pdgfb_gsva_res1), Class=eSet_wo_infl$Class)
df_plasticity$Class <- factor(df_plasticity$Class, levels = c("Control", "HkNR", "Dysplasia", "OSCC"))
ggplot(df_plasticity,aes(pEMT, pdgfb)) +
stat_summary(fun.data=mean_cl_normal) +
geom_smooth(method='lm', formula= y~x, ) +
labs(y = "PDGFRb GSVA scores", x = "pEMT GSVA scores")
## Warning: Removed 66 rows containing missing values (geom_segment).
sp1 <- ggpubr::ggscatter(df_plasticity, x = "pEMT", y = "pdgfb",
add = "reg.line", # Add regressin line
add.params = list(color = "blue", fill = "lightgray"), # Customize reg. line
conf.int = TRUE # Add confidence interval
)
# Add correlation coefficient
sp1 + ggpubr::stat_cor(method = "pearson") +
labs(y = "PDGFRb GSVA scores", x = "pEMT GSVA scores")
## `geom_smooth()` using formula 'y ~ x'
sp2 <- ggpubr::ggscatter(df_plasticity, x = "pEMT", y = "pdgfb", col="Class",
add = "reg.line", # Add regressin line
add.params = list(color = "blue", fill = "lightgray"), # Customize reg. line
conf.int = TRUE # Add confidence interval
) +
scale_color_npg(alpha = 0.8)+
scale_fill_npg(alpha = 0.7)+
ggplot2::geom_jitter(size=0.3)
# Add correlation coefficient
sp2 +
ggpubr::stat_cor(method = "pearson") +
labs(y = "PDGFRb GSVA scores", x = "pEMT GSVA scores")
## `geom_smooth()` using formula 'y ~ x'
ggplot(df_plasticity,aes(pEMT, pdgfb, col=Class)) +
geom_point() +
scale_color_npg(alpha = 0.8)+
scale_fill_npg(alpha = 0.7)+
ggplot2::geom_jitter(size=0.3) +
ggpubr::stat_cor(method = "pearson")+
labs(y = "PDGFRb signature", x = "pEMT signature")
ggplot(df_plasticity,aes(pEMT, pdgfb, col=Class)) +
stat_summary(fun.data=mean_cl_normal) +
geom_smooth(method='lm', formula= y~x) +
scale_color_npg(alpha = 0.8)+
scale_fill_npg(alpha = 0.7)+
ggplot2::geom_jitter(size=0.3) +
facet_wrap(~Class, ncol = 4) +
labs(y = "PDGFRb signature", x = "pEMT signature")
## Warning: Removed 18 rows containing missing values (geom_segment).
## Warning: Removed 17 rows containing missing values (geom_segment).
## Warning: Removed 22 rows containing missing values (geom_segment).
## Warning: Removed 9 rows containing missing values (geom_segment).
hypeR is run on BIOCARTA, KEGG, REACTOME separately and the .Rmd files for each is separate files. The results are stored in 0.Supporting Material>results>PML_Pathways in the GDrive folder.
with fdr = 0.1
rctbl_mhyp_2 <- function(mhyp,
show_emaps=FALSE,
show_hmaps=FALSE,
hyp_emap_args=list(top=25, val="fdr"),
hyp_hmap_args=list(top=25, val="fdr"),
searchable = TRUE, resizable = TRUE) {
mhyp.df <- data.frame(signature=names(mhyp$data),
size=sapply(mhyp$data, function(hyp) hyp$info[["Signature Size"]]),
enriched=sapply(mhyp$data, function(hyp) nrow(hyp$data)),
gsets=sapply(mhyp$data, function(hyp) hyp$info[["Genesets"]]),
bg=sapply(mhyp$data, function(hyp) hyp$info[["Background"]]))
tbl <- reactable(
mhyp.df,
showPageSizeOptions = FALSE,
onClick = "expand",
resizable = TRUE,
rownames = FALSE,
searchable = TRUE,
filterable = TRUE,
defaultColDef = colDef(headerClass="rctbl-outer-header"),
columns = list(signature = colDef(name="Signature", minWidth=300),
size = colDef(name="Signature Size"),
enriched = colDef(name="Enriched Genesets"),
gsets = colDef(name="Genesets"),
bg = colDef(name="Background")),
details = function(index) {
hyp <- mhyp$data[[index]]
rctbl_hyp(hyp, type="inner", show_emaps, show_hmaps, hyp_emap_args, hyp_hmap_args)
},
wrap = FALSE,
class = "rctbl-outer-tbl",
rowStyle = list(cursor="pointer")
)
htmltools::div(class="rctbl-outer-obj", tbl)
}
HALLMARK <- msigdb_gsets("Homo sapiens", "H", "")
names(HALLMARK$genesets) <- names(HALLMARK$genesets) %>% strsplit( "HALLMARK_" ) %>% sapply( tail, 1 )
lmultihyp1 <- hypeR(c(list_signs[[1]],list_signs[[2]], list_signs[[3]]) , HALLMARK, background = nrow(eSet_wo_infl))
## cancer.vs.control_up
## cancer.vs.control_down
## dys.vs.control_up
## dys.vs.control_down
## hknr.vs.control_up
## hknr.vs.control_down
## dys.vs.cancer_up
## dys.vs.cancer_down
## hknr.vs.cancer_up
## hknr.vs.cancer_down
## hknr.vs.dys_up
## hknr.vs.dys_down
rctbl_mhyp(lmultihyp1)
hyp_dots(lmultihyp1, fdr=0.1, merge = TRUE, top = 15, title = "All Pairwise")
#hyp_to_excel(lmultihyp1, file_path = "~/Documents/Research/pml/pml_wo_infl/0906_hyper.xlsx")
#for manuscript
p1 <- hyp_dots(lmultihyp1, fdr=0.1, merge = TRUE, top = 15) +
labs(y="", x="") + theme_bw() +
theme_bw() +
theme(axis.title.y=element_blank(), axis.text.y = element_blank(), axis.ticks.y = element_blank(), axis.text.x = element_blank(), axis.ticks.x = element_blank(), legend.position = "none")
p1
REACTOME <- msigdb_gsets(species="Homo sapiens", category="C2", subcategory="CP:REACTOME", clean = TRUE)
names(REACTOME$genesets) <- names(REACTOME$genesets) %>% strsplit( "REACTOME_" ) %>% sapply( tail, 1 )
lmultihyp2 <- hypeR(c(list_signs[[1]],list_signs[[2]], list_signs[[3]]) , REACTOME, background = rownames(eSet_wo_infl))
## cancer.vs.control_up
## cancer.vs.control_down
## dys.vs.control_up
## dys.vs.control_down
## hknr.vs.control_up
## hknr.vs.control_down
## dys.vs.cancer_up
## dys.vs.cancer_down
## hknr.vs.cancer_up
## hknr.vs.cancer_down
## hknr.vs.dys_up
## hknr.vs.dys_down
hyp_dots(lmultihyp2, fdr=0.1, merge = TRUE, title = "Pairwise w/ background=rownames(eset)")
rctbl_mhyp(lmultihyp2)
lmultihyp3 <- hypeR(c(list_signs[[1]],list_signs[[2]], list_signs[[3]]) , REACTOME, background = nrow(eSet_wo_infl))
## cancer.vs.control_up
## cancer.vs.control_down
## dys.vs.control_up
## dys.vs.control_down
## hknr.vs.control_up
## hknr.vs.control_down
## dys.vs.cancer_up
## dys.vs.cancer_down
## hknr.vs.cancer_up
## hknr.vs.cancer_down
## hknr.vs.dys_up
## hknr.vs.dys_down
hyp_dots(lmultihyp3, top = 15, fdr=0.1, merge = TRUE, title = "Pairwise w/ background=nrow(eset)")
rctbl_mhyp(lmultihyp3)
# write.xlsx(lmultihyp3$data$cancer.vs.control_up$as.data.frame(), file = "~/Documents/Research/pml/pml_wo_infl/08_30_host_hyper_reactome.xlsx", sheetName = "cancer.vs.control")
#
# for (i in 1: length(names(lmultihyp3$data))) {
# gc()
# write.xlsx(lmultihyp3$[names(lmultihyp3$data[i])], file = "~/Documents/Research/pml/pml_wo_infl/08_30_host_hyper_reactome.xlsx", sheetName = names(lmultihyp3$data[i]), append = T)
# }
Creating heirarchical genesets
genesets <- REACTOME$genesets
length(genesets)
## [1] 1604
Clustering
suppressPackageStartupMessages(library(hierarchicalSets))
suppressPackageStartupMessages(library(qdapTools))
mat <- genesets %>%
qdapTools::mtabulate() %>%
as.matrix() %>%
t() %>%
hierarchicalSets::format_sets()
## Warning in format_sets.matrix(.): Values above 1 set to 1
hierarchy <- hierarchicalSets::create_hierarchy(mat, intersectLimit=1)
print(hierarchy)
## A HierarchicalSet object
##
## Universe size: 10968
## Number of sets: 1604
## Number of independent clusters: 272
plot(hierarchy, type='intersectStack', showHierarchy=TRUE, label=FALSE)
plot(hierarchy, type='outlyingElements', quantiles=0.75, alpha=0.5, label=FALSE)
suppressPackageStartupMessages(library(dendextend))
suppressPackageStartupMessages(library(stringi))
find.trees <- function(d) {
subtrees <- dendextend::partition_leaves(d)
leaves <- subtrees[[1]]
find.paths <- function(leaf) {
which(sapply(subtrees, function(x) leaf %in% x))
}
paths <- lapply(leaves, find.paths)
edges <- data.frame(from=c(), to=c())
if (length(d) > 1) {
for (path in paths) {
for (i in seq(1, length(path)-1)) {
edges <- rbind(edges, data.frame(from=path[i], to=path[i+1]))
}
}
edges <- dplyr::distinct(edges)
edges$from <- paste0("N", edges$from)
edges$to <- paste0("N", edges$to)
}
names(subtrees) <- paste0("N", seq(1:length(subtrees)))
nodes <- data.frame(id=names(subtrees))
rownames(nodes) <- nodes$id
nodes$label <- ""
leaves <- sapply(subtrees, function(x) length(x) == 1)
nodes$label[leaves] <- sapply(subtrees[leaves], function(x) x[[1]])
nodes$id <- NULL
# Internal nodes will not have labels, so we can generate unique hash identifiers
ids <- stringi::stri_rand_strings(nrow(nodes), 32)
names(ids) <- rownames(nodes)
rownames(nodes) <- ids[rownames(nodes)]
edges$from <- ids[edges$from]
edges$to <- ids[edges$to]
return(list("edges"=edges, "nodes"=nodes))
}
trees <- lapply(hierarchy$clusters, find.trees)
length(trees)
## [1] 272
Nodes
nodes.all <- lapply(trees, function(x) x$nodes)
nodes <- do.call(rbind, nodes.all)
head(nodes)
## label
## p5VETLzYQ6PWveG4E620rKJyn2zDPBve
## 6ABbzO9BSzqiOahQRQhUHZvQpTXQ83U0 Mitochondrial Uncoupling
## AQD5xWNrCDeaxo6lkenVAaPiAKqhr0w1 The Fatty Acid Cycling Model
## LnRdoQxL5Bmg0ISQ3GCn7PnUGQKkRuAx
## bD6wwQ05h9x9CUjvb7UQ14G1rYnrROjw Metallothioneins Bind Metals
## Kju3YoxlOkSBmoFnWn8SLilHyZIMSYT1 Response To Metal Ions
Edges
edges.all <- lapply(trees, function(x) x$edges)
edges <- data.frame(do.call(rbind, edges.all))
head(edges)
## from to
## 1 p5VETLzYQ6PWveG4E620rKJyn2zDPBve 6ABbzO9BSzqiOahQRQhUHZvQpTXQ83U0
## 2 p5VETLzYQ6PWveG4E620rKJyn2zDPBve AQD5xWNrCDeaxo6lkenVAaPiAKqhr0w1
## 3 LnRdoQxL5Bmg0ISQ3GCn7PnUGQKkRuAx bD6wwQ05h9x9CUjvb7UQ14G1rYnrROjw
## 4 LnRdoQxL5Bmg0ISQ3GCn7PnUGQKkRuAx Kju3YoxlOkSBmoFnWn8SLilHyZIMSYT1
## 5 MMnvHJgCPzLapA7X8Mh8UCSHbcKOnY64 CDcHCkJ6TLgrJ6ueV2NhiSDpt3H7AXMG
## 6 MMnvHJgCPzLapA7X8Mh8UCSHbcKOnY64 4VcZUUvnefndTN2YUuzgWgmzjIBMpTt3
Create the relational genesets object
#genesets <- hyperdb_rgsets("REACTOME", version="70.0")
rgsets_obj <- rgsets$new(genesets, nodes, edges, name="REACTOME", version=msigdb_version())
rgsets_obj
## REACTOME v7.4.1
##
## Genesets
##
## 2 Ltr Circle Formation (7)
## A Tetrasaccharide Linker Sequence Is Required For Gag Synthesis (26)
## Abacavir Metabolism (5)
## Abacavir Transmembrane Transport (5)
## Abacavir Transport And Metabolism (10)
## Abc Family Proteins Mediated Transport (136)
##
## Nodes
##
## label
## p5VETLzYQ6PWveG4E620rKJyn2zDPBve
## 6ABbzO9BSzqiOahQRQhUHZvQpTXQ83U0 Mitochondrial Uncoupling
## AQD5xWNrCDeaxo6lkenVAaPiAKqhr0w1 The Fatty Acid Cycling Model
## LnRdoQxL5Bmg0ISQ3GCn7PnUGQKkRuAx
## bD6wwQ05h9x9CUjvb7UQ14G1rYnrROjw Metallothioneins Bind Metals
## Kju3YoxlOkSBmoFnWn8SLilHyZIMSYT1 Response To Metal Ions
## id length
## p5VETLzYQ6PWveG4E620rKJyn2zDPBve p5VETLzYQ6PWveG4E620rKJyn2zDPBve 6
## 6ABbzO9BSzqiOahQRQhUHZvQpTXQ83U0 6ABbzO9BSzqiOahQRQhUHZvQpTXQ83U0 6
## AQD5xWNrCDeaxo6lkenVAaPiAKqhr0w1 AQD5xWNrCDeaxo6lkenVAaPiAKqhr0w1 5
## LnRdoQxL5Bmg0ISQ3GCn7PnUGQKkRuAx LnRdoQxL5Bmg0ISQ3GCn7PnUGQKkRuAx 14
## bD6wwQ05h9x9CUjvb7UQ14G1rYnrROjw bD6wwQ05h9x9CUjvb7UQ14G1rYnrROjw 11
## Kju3YoxlOkSBmoFnWn8SLilHyZIMSYT1 Kju3YoxlOkSBmoFnWn8SLilHyZIMSYT1 14
##
## Edges
##
## from to
## 1 p5VETLzYQ6PWveG4E620rKJyn2zDPBve 6ABbzO9BSzqiOahQRQhUHZvQpTXQ83U0
## 2 p5VETLzYQ6PWveG4E620rKJyn2zDPBve AQD5xWNrCDeaxo6lkenVAaPiAKqhr0w1
## 3 LnRdoQxL5Bmg0ISQ3GCn7PnUGQKkRuAx bD6wwQ05h9x9CUjvb7UQ14G1rYnrROjw
## 4 LnRdoQxL5Bmg0ISQ3GCn7PnUGQKkRuAx Kju3YoxlOkSBmoFnWn8SLilHyZIMSYT1
## 5 MMnvHJgCPzLapA7X8Mh8UCSHbcKOnY64 CDcHCkJ6TLgrJ6ueV2NhiSDpt3H7AXMG
## 6 MMnvHJgCPzLapA7X8Mh8UCSHbcKOnY64 4VcZUUvnefndTN2YUuzgWgmzjIBMpTt3
hyp1 <- hypeR(c(list_signs[[1]], list_signs[[2]], list_signs[[3]]), rgsets_obj, background = rownames(exprs(eSet_wo_infl)))
## cancer.vs.control_up
## cancer.vs.control_down
## dys.vs.control_up
## dys.vs.control_down
## hknr.vs.control_up
## hknr.vs.control_down
## dys.vs.cancer_up
## dys.vs.cancer_down
## hknr.vs.cancer_up
## hknr.vs.cancer_down
## hknr.vs.dys_up
## hknr.vs.dys_down
hyp_dots(hyp1, fdr=0.1, top=20, merge = TRUE, title = "Pairwise w/ background=rownames(eset)", sizes = TRUE)
rctbl_mhyp(hyp1)
#this was used in the manuscript!!!
hyp2 <- hypeR(c(list_signs[[1]], list_signs[[2]], list_signs[[3]]), rgsets_obj, background = nrow(exprs(eSet_wo_infl)))
## cancer.vs.control_up
## cancer.vs.control_down
## dys.vs.control_up
## dys.vs.control_down
## hknr.vs.control_up
## hknr.vs.control_down
## dys.vs.cancer_up
## dys.vs.cancer_down
## hknr.vs.cancer_up
## hknr.vs.cancer_down
## hknr.vs.dys_up
## hknr.vs.dys_down
#hyp_to_excel(hyp2, file_path = "~/Documents/Research/pml/pml_wo_infl/0906_reactome.xlsx")
rctbl_mhyp(hyp2)
hyp_dots(hyp2, fdr=0.1, top=15, merge = TRUE, title = "Pairwise w/ background=nrow(eset)", sizes = TRUE)
hyp_hmap(hyp2, fdr=0.1, top=15)
## $cancer.vs.control_up
##
## $cancer.vs.control_down
##
## $dys.vs.control_up
##
## $dys.vs.control_down
##
## $hknr.vs.control_up
## NULL
##
## $hknr.vs.control_down
## NULL
##
## $dys.vs.cancer_up
##
## $dys.vs.cancer_down
## NULL
##
## $hknr.vs.cancer_up
##
## $hknr.vs.cancer_down
##
## $hknr.vs.dys_up
##
## $hknr.vs.dys_down
## NULL
#do hierarchical clust
.dots_multi_plot <- function(multihyp_data,
top=20,
abrv=50,
sizes=TRUE,
pval_cutoff=1,
fdr_cutoff=1,
val=c("fdr", "pval"),
title="") {
# Default arguments
val <- match.arg(val)
# Count significant genesets across signatures
multihyp_dfs <- lapply(multihyp_data, function(hyp_obj) {
hyp_obj$data %>%
dplyr::filter(pval <= pval_cutoff) %>%
dplyr::filter(fdr <= fdr_cutoff) %>%
dplyr::select(label)
})
# Take top genesets
labels <- names(sort(table(unlist(multihyp_dfs)), decreasing=TRUE))
if (!is.null(top)) labels <- head(labels, top)
# Handle empty dataframes
if (length(labels) == 0) return(ggempty())
# Create a multihyp dataframe
dfs <- lapply(multihyp_data, function(hyp_obj) {
hyp_df <- hyp_obj$data
hyp_df[hyp_df$label %in% labels, c("label", val), drop=FALSE]
})
df <- suppressWarnings(Reduce(function(x, y) merge(x, y, by="label", all=TRUE), dfs))
colnames(df) <- c("label", names(dfs))
rownames(df) <- df$label
df <- df[rev(labels), names(dfs)]
# Abbreviate labels
label.abrv <- substr(rownames(df), 1, abrv)
if (any(duplicated(label.abrv))) {
stop("Non-unique labels after abbreviating")
} else {
rownames(df) <- factor(label.abrv, levels=label.abrv)
}
if (val == "pval") {
cutoff <- pval_cutoff
color.label <- "P-Value"
}
if (val == "fdr") {
cutoff <- fdr_cutoff
color.label <- "FDR"
}
df.melted <- reshape2::melt(as.matrix(df))
colnames(df.melted) <- c("label", "signature", "significance")
df.melted$size <- if(sizes) df.melted$significance else 1
return(df.melted)
}
arrange the dotplot according to clustered groups
.reverselog_trans <- function(base=exp(1)) {
trans <- function(x) -log(x, base)
inv <- function(x) base^(-x)
scales::trans_new(paste0("reverselog-", format(base)), trans, inv,
scales::log_breaks(base=base),
domain=c(1e-100, Inf))
}
df <- .dots_multi_plot(hyp2$data, top = 15, abrv = 75, sizes = TRUE, fdr_cutoff = 0.1, pval_cutoff = 0.05, val = "fdr", title)
df <- df[df$significance <= 0.1, ]
#rownames(df) <- NULL
df %>%
ggplot(aes(x=signature, y=(factor(label)), color=significance, size=size)) +
geom_point() +
scale_color_continuous(low="#114357", high="#E53935", trans=.reverselog_trans(10)) +
scale_size_continuous(trans=.reverselog_trans(10), guide="none") +
theme_bw() +
ggplot2::theme(axis.text.x = element_text(angle = 45, hjust = 1))
label_ordered <- c("Extracellular Matrix Organization", "Activation Of Matrix Metalloproteinases", "Degradation Of The Extracellular Matrix", "Cytokine Signaling In Immune System", "Interleukin 4 And Interleukin 13 Signaling", "Anti Inflammatory Response Favouring Leishmania Parasite Infection", "Fcgr Activation", "Fcgr3a Mediated Il10 Synthesis", "Immunoregulatory Interactions Between A Lymphoid And A Non Lymphoid Cell", "Formation Of The Cornified Envelope", "Collagen Formation", "Collagen Degradation", "Biological Oxidations", "Fatty Acids", "Cytochrome P450 Arranged By Substrate Type")
df <- df[order(match(df$label, label_ordered)),]
df$label <- factor(df$label, levels = rev(label_ordered))
df %>%
ggplot(aes(x=signature, y=(factor(label)), color=significance, size=size)) +
geom_point() +
scale_color_continuous(low="#114357", high="#E53935", trans=.reverselog_trans(10)) +
scale_size_continuous(trans=.reverselog_trans(10), guide="none") + theme_bw()
# +theme(axis.title.y=element_blank(), axis.text.y = element_blank(), axis.ticks.y = element_blank(), axis.text.x = element_blank(), axis.ticks.x = element_blank(), legend.position = "none")
#for manuscript
p1 <- df %>%
ggplot(aes(x=signature, y=(factor(label)), color=significance, size=size)) +
geom_point() +
scale_color_continuous(low="#114357", high="#E53935", trans=.reverselog_trans(10)) +
scale_size_continuous(trans=.reverselog_trans(10), guide="none") + theme_bw() +
theme(axis.title.y=element_blank(), axis.text.y = element_blank(), axis.ticks.y = element_blank(), axis.text.x = element_blank(), axis.ticks.x = element_blank(), legend.position = "none")
p1
with fdr = 0.1
KEGG <- msigdb_gsets(species="Homo sapiens", category = 'C2', subcategory = 'CP:KEGG')
#enrichr_gsets("KEGG_2019_Human")
lmultihyp1 <- hypeR(list_signs[[1]], KEGG)
## cancer.vs.control_up
## cancer.vs.control_down
## dys.vs.control_up
## dys.vs.control_down
## hknr.vs.control_up
## hknr.vs.control_down
hyp_dots(lmultihyp1, fdr=0.1, merge = TRUE, title = "PML with Controls")
rctbl_mhyp(lmultihyp1)
lmultihyp2 <- hypeR(list_signs[[2]], KEGG)
## dys.vs.cancer_up
## dys.vs.cancer_down
## hknr.vs.cancer_up
## hknr.vs.cancer_down
hyp_dots(lmultihyp2, fdr=0.1, merge = TRUE, title = "PML with Cancer")
rctbl_mhyp(lmultihyp2)
lmultihyp3 <- hypeR(list_signs[[3]], KEGG)
## hknr.vs.dys_up
## hknr.vs.dys_down
hyp_dots(lmultihyp3, fdr=0.1, merge = TRUE, title = "Pairwise PML")
rctbl_mhyp(lmultihyp3)
with fdr = 0.1
GO <- msigdb_gsets("Homo sapiens", "C5", subcategory = 'CC')
names(GO$genesets) <- names(GO$genesets) %>% strsplit( "GOMF_" ) %>% sapply( tail, 1 )
length(unique(names(GO$genesets)))
## [1] 996
lmultihyp1 <- hypeR(list_signs[[1]], GO)
## cancer.vs.control_up
## cancer.vs.control_down
## dys.vs.control_up
## dys.vs.control_down
## hknr.vs.control_up
## hknr.vs.control_down
hyp_dots(lmultihyp1, fdr=0.1, merge = TRUE, title = "PML with Controls")
rctbl_mhyp(lmultihyp1)
lmultihyp2 <- hypeR(list_signs[[2]], GO)
## dys.vs.cancer_up
## dys.vs.cancer_down
## hknr.vs.cancer_up
## hknr.vs.cancer_down
hyp_dots(lmultihyp2, fdr=0.1, merge = TRUE, title = "PML with Cancer")
rctbl_mhyp(lmultihyp2)
lmultihyp3 <- hypeR(list_signs[[3]], GO)
## hknr.vs.dys_up
## hknr.vs.dys_down
hyp_dots(lmultihyp3, fdr=0.1, merge = TRUE, title = "Pairwise PML")
rctbl_mhyp(lmultihyp3)
lmultihyp1 <- hypeR(c(list_signs[[1]],list_signs[[2]], list_signs[[3]]) , GO, fdr = 0.1)
## cancer.vs.control_up
## cancer.vs.control_down
## dys.vs.control_up
## dys.vs.control_down
## hknr.vs.control_up
## hknr.vs.control_down
## dys.vs.cancer_up
## dys.vs.cancer_down
## hknr.vs.cancer_up
## hknr.vs.cancer_down
## hknr.vs.dys_up
## hknr.vs.dys_down
hyp_dots(lmultihyp1, fdr=0.1, merge = TRUE, top = 50, title = "All Pairwise", abrv = 50)
rctbl_mhyp(lmultihyp1)